Minimal Character-level TensorFlow RNN model.

The following code is a TensorFlow adaptation of the minimal character-level vanilla RNN model written by Andrej Karpathy.

The Unreasonable Effectiveness of Recurrent Neural Networks is a great source of inspiration for understanding the power of RNNs.

This notebook is for beginners who want to understand RNNs and the basics of TensorFlow by reading code.

More resources:


In [ ]:
"""
Minimal character-level TensorFlow RNN model.
Original code written by Andrej Karpathy (@karpathy),  
adapted to TensorFlow by Damien Henry (@dh7net)
BSD License
"""
import numpy as np
import tensorflow as tf
tf.reset_default_graph() # Useful in Jupyter, to run the code several times

# data I/O
data = open('methamorphosis.txt', 'r').read() # should be simple plain text file
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print 'data has %d characters, %d unique.' % (data_size, vocab_size)
char_to_ix = { ch:i for i,ch in enumerate(chars) } # to convert a char to an ID
ix_to_char = { i:ch for i,ch in enumerate(chars) } # to convert an ID back to a char
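# For example, if chars == ['a', 'b', 'c'] then char_to_ix['b'] == 1 and
# ix_to_char[1] == 'b' (the actual IDs depend on the ordering of the set)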

# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 0.002
decay_rate = 0.98 # learning rate decay, applied once per epoch in the training loop

# model parameters
Wxh = tf.Variable(tf.random_uniform((hidden_size, vocab_size))*0.01, name='Wxh') # input to hidden
Whh = tf.Variable(tf.random_uniform((hidden_size, hidden_size))*0.01, name='Whh') # hidden to hidden
Why = tf.Variable(tf.random_uniform((vocab_size, hidden_size))*0.01, name='Why') # hidden to output
bh = tf.Variable(tf.zeros((hidden_size, 1)), name='bh') # hidden bias
by = tf.Variable(tf.zeros((vocab_size, 1)), name='by') # output bias
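# Shapes: the input xs[t] is a one-hot column vector of shape (vocab_size, 1), the hidden
# state hs[t] has shape (hidden_size, 1), and the output scores ys[t] have shape (vocab_size, 1),
# so Wxh is (hidden_size, vocab_size), Whh is (hidden_size, hidden_size) and Why is (vocab_size, hidden_size)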

# Define placeholders for the input and the target, and split them into per-step sequences
input_data = tf.placeholder(tf.float32, [seq_length, vocab_size], name='input_data')
xs = tf.split(0, seq_length, input_data)
target_data = tf.placeholder(tf.float32, [seq_length, vocab_size], name='target_data') 
targets = tf.split(0, seq_length, target_data)  
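# tf.split along dimension 0 turns each (seq_length, vocab_size) placeholder into a list of
# seq_length tensors of shape (1, vocab_size), one per time step; each one is transposed
# into a column vector inside the unrolling loop below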
# initial_state & loss
initial_state = tf.zeros((hidden_size, 1))
loss = tf.zeros([1], name='loss')
# unroll recursion to create the forward pass graph
hs, ys, ps = {}, {}, {}
hs[-1] = initial_state                                                                                                                                                                             
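# At each step t: hs[t] = tanh(Wxh*xs[t] + Whh*hs[t-1] + bh), ys[t] = Why*hs[t] + by,
# ps[t] = softmax(ys[t]); the loss sums the cross-entropy between ps[t] and the target char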
for t in xrange(seq_length):
    xs_t = tf.transpose(xs[t])
    targets_t = tf.transpose(targets[t]) 
    hs[t] = tf.tanh(tf.matmul(Wxh, xs_t) + tf.matmul(Whh, hs[t-1]) + bh) # hidden state
    ys[t] = tf.matmul(Why, hs[t]) + by # unnormalized log probabilities for next chars
    ps[t] = tf.exp(ys[t]) / tf.reduce_sum(tf.exp(ys[t])) # probabilities for next chars
    loss += -tf.log(tf.reduce_sum(tf.mul(ps[t], targets_t))) # softmax (cross-entropy loss)

cost = loss / seq_length
final_state = hs[seq_length-1]
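# final_state is returned by each training step and fed back as initial_state for the
# next batch, so the hidden state carries over between consecutive sequences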
lr = tf.Variable(0.0, trainable=False, name='learning_rate')
tvars = tf.trainable_variables()
# The gradients are computed by TensorFlow with tf.gradients(cost, tvars)
grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 5) # clip exploding gradients
optimizer = tf.train.AdamOptimizer(lr) 
train_op = optimizer.apply_gradients(zip(grads, tvars))
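# Gradients are clipped by their global norm (max 5) to avoid exploding gradients, then
# applied with Adam; the learning rate lr is assigned (and decayed) in the training loop below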

def sample(h, seed_ix, n):
  """ 
  sample a sequence of integers from the model 
  h is memory state, seed_ix is seed letter for first time step
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
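  # run the recurrence in NumPy, reading the current values of the TF variables with
  # .eval(); at each step, sample the next character ID from the distribution p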
  for t in xrange(n):
    h = np.tanh(np.dot(Wxh.eval(), x) + np.dot(Whh.eval(), h) + bh.eval())
    y = np.dot(Why.eval(), h) + by.eval()
    p = np.exp(y) / np.sum(np.exp(y))
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes

def vectorize(x): # take an array of indices and return an array of one-hot vectors
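    # e.g. if vocab_size == 4, vectorize([2, 0]) -> [[0., 0., 1., 0.], [1., 0., 0., 0.]]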
    vectorized = np.zeros((len(x), vocab_size))
    for i in range(0, len(x)):
        vectorized[i][x[i]] = 1
    return vectorized

n, p, epoch = 0, 0, 0
smooth_loss = -np.log(1.0/vocab_size) # expected cost at iteration 0 (cost is already averaged over seq_length)
with tf.Session() as sess:
    tf.initialize_all_variables().run()
    print "all variable initialized"
    while True:
        # prepare inputs (we're sweeping from left to right in steps seq_length long)
        if p+seq_length+1 >= len(data) or n == 0: 
            state = initial_state.eval() # reset RNN memory
            sess.run(tf.assign(lr, learning_rate * (decay_rate ** epoch)))
            p = 0 # go from start of data
            epoch += 1 # increase epoch number
        x = vectorize([char_to_ix[ch] for ch in data[p:p+seq_length]])
        y = vectorize([char_to_ix[ch] for ch in data[p+1:p+seq_length+1]])
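        # x and y are (seq_length, vocab_size) one-hot matrices; y is x shifted by one character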
        # Build the feed dictionary: inputs, targets, and the hidden state carried over from the previous batch
        feed = {input_data: x, target_data: y, initial_state: state}
        # Run one training step and fetch the cost and the final hidden state
        [train_loss], state, _ = sess.run([cost, final_state, train_op], feed)
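        # keep an exponential moving average of the loss for smoother progress reports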
        smooth_loss = smooth_loss * 0.999 + train_loss * 0.001
        # sample from the model now and then
        if n % 1000 == 0:
            print 'iter %d, loss: %f' % (n, smooth_loss) # print progress
            sample_ix = sample(state, char_to_ix[data[p]], 200)
            txt = ''.join(ix_to_char[ix] for ix in sample_ix)
            print '----\n %s \n----' % (txt, )

        p += seq_length # move data pointer
        n += 1 # iteration counter

Feedback welcome @dh7net